################################################################################
# Ingroup vs Intragroup Annotation Quality
################################################################################
################################################################################
# Dependencies
################################################################################
library(dplyr)
library(tidyr)
library(readr)
library(stringr)
library(lubridate)
library(ggplot2)
library(ggthemes)
library(scales)
library(irr)
library(tidycomm)
library(forestmangr)
################################################################################
# Clear workspace and set working directory to the script location
################################################################################
# Clear the global workspace, then make the folder that contains this script
# the working directory so that the relative "../data" and "../annotations"
# paths used further down resolve regardless of where R was started.
# NOTE(review): rm(list = ls()) and setwd() both change global session state;
# they are kept because the rest of the script relies on the relative paths.
rm(list = ls())
# - set dir
args <- commandArgs()

# Look for a "--file=<path>" entry among the command-line arguments.
scriptName <- args[startsWith(args, "--file=")]

if (length(scriptName) == 0) {
  # No "--file=" argument: fall back to the file open in the RStudio editor.
  scriptName <- rstudioapi::getSourceEditorContext()$path
} else {
  # Strip the "--file=" prefix; only the first match is used.
  scriptName <- sub("^--file=", "", scriptName[1])
}

# dirname() also handles a bare file name without any directory part (it
# returns "."), a case in which splitting the path on separators yields no
# second piece. The empty last component keeps pathName in its "<dir>/" form.
pathName <- file.path(dirname(scriptName), "")

setwd(pathName)
################################################################################
# Load all annotation Sets
################################################################################
# Source table; the annotation rounds are column-bound to it further down.
org_df <- readr::read_csv(
  file = "../data/citizen_science_for_annots_27012022.csv"
)

# Raw chatGPT zero-shot output, one CSV per sensitivity-check round.
round_files <- c(
  "../annotations/chatGPT/output_zero_shot/label_hatespeech_round_sensitivity_check_1.csv",
  "../annotations/chatGPT/output_zero_shot/label_hatespeech_round_sensitivity_check_2.csv",
  "../annotations/chatGPT/output_zero_shot/label_hatespeech_round_sensitivity_check_3.csv"
)
round_tables <- lapply(round_files, readr::read_csv)

df1 <- round_tables[[1]]
df2 <- round_tables[[2]]
df3 <- round_tables[[3]]

################################################################################
# Transform into one dataframe 
################################################################################

# Parse one JSON string into an R list without ever raising an error.
#
# json_str: a length-one character string expected to hold a JSON object,
#   e.g. '{"Label": "TOXIC", "reason": "..."}'.
# Returns: the parsed list when parsing succeeds and yields a list; otherwise
#   list(Label = NA, reason = NA) (missing input, malformed JSON, or a JSON
#   value that does not decode to a list).
safe_parse_json <- function(json_str) {
  fallback <- list(Label = NA, reason = NA)

  # The string is handed to the parser unchanged: putting a backslash in front
  # of the structural quotes would turn valid JSON into invalid JSON.
  # fromJSON is called through its namespace because this script does not
  # attach jsonlite; an unresolved `fromJSON` would be caught by tryCatch()
  # below and every input would silently come back as the NA fallback.
  parsed <- tryCatch(
    jsonlite::fromJSON(json_str),
    error = function(e) fallback
  )

  # A bare JSON scalar/array decodes to an atomic vector, which has no
  # Label/reason fields to unpack; treat it like a parse failure.
  if (is.list(parsed)) parsed else fallback
}

# Bring each raw round (df1, df2, df3) into the analysis layout: parse the
# JSON reply held in `label`, derive the binary IsHateSpeech / IsToxic flags,
# tag the rows with Group and Annotator, and attach the columns of org_df.
# The processed tables are written back to df1, df2 and df3.
rounds <- list(df1, df2, df3)

for (i in seq_along(rounds)) {
  tmp <- rounds[[i]] %>%
    # Remove every ``` and every "json" substring from the raw reply
    # (presumably markdown code fences around the JSON - note that this also
    # removes "json" wherever else it occurs in the text).
    dplyr::mutate(label = gsub("\\`\\`\\`|json", "", label)) %>%
    # One parsed list per row, built as a plain list column so that no
    # row-wise grouping is left on the table afterwards.
    dplyr::mutate(parsed = lapply(label, safe_parse_json)) %>%
    unnest_wider(parsed) %>%
    dplyr::mutate(Group = "chatGPT", Annotator = i)

  # Replies that could not be parsed have Label = NA and are coded 0/0 below;
  # report how many there are instead of letting them pass unnoticed.
  n_unparsed <- sum(is.na(tmp$Label))
  if (n_unparsed > 0) {
    warning(
      "Round ", i, ": ", n_unparsed,
      " row(s) without a usable Label; coded as IsHateSpeech = 0, IsToxic = 0.",
      call. = FALSE
    )
  }

  tmp <- tmp %>%
    dplyr::mutate(
      IsHateSpeech = ifelse(Label %in% c("HATE SPEECH", "HATE SPEECH (1)"), 1, 0),
      IsToxic = ifelse(Label %in% c("TOXIC", "TOXIC SPEECH", "TOXIC SPEECH (2)"), 1, 0)
    ) %>%
    dplyr::select(-c(Label, reason, Kommentar))

  # bind_cols() pairs rows purely by position, so both tables must describe
  # the same rows in the same order; at least make sure the sizes agree.
  if (nrow(tmp) != nrow(org_df)) {
    stop(
      "Round ", i, " has ", nrow(tmp), " rows but org_df has ", nrow(org_df),
      "; the tables are combined by row position.",
      call. = FALSE
    )
  }

  tmp <- dplyr::bind_cols(tmp, org_df)
  tmp <- tmp %>%
    dplyr::select(c(ArticleID, ID, Kommentar, Titel, Text, Group, Annotator, IsHateSpeech, IsToxic))

  rounds[[i]] <- tmp
}

df1 <- rounds[[1]]
df2 <- rounds[[2]]
df3 <- rounds[[3]]


# Stack the three processed rounds into one long table.
fin_df <- dplyr::bind_rows(df1, df2, df3)

# Sanity check: row counts per IsHateSpeech / IsToxic combination. Printed
# explicitly, and only once, so the table also appears when the script is
# run through source() rather than relying on top-level auto-printing.
print(
  fin_df %>%
    group_by(IsHateSpeech, IsToxic) %>%
    summarise(n = n())
)

write_csv(fin_df, "../annotations/chatGPT/chatgpt_set_zeroshot_sensitivity_check_for_analysis.csv")
